Add sqlite reader and adjust SQL queries to work there (#182)

georgestagg merged 13 commits into main from the feature branch.
Conversation
| default = ["duckdb", "sqlite", "vegalite", "ipc", "builtin-data"] | ||
| default = ["duckdb", "sqlite", "vegalite", "ipc", "parquet", "builtin-data"] | ||
| ipc = ["polars/ipc"] | ||
| duckdb = ["dep:duckdb", "dep:arrow"] | ||
| polars-sql = ["polars/sql"] | ||
| builtin-data = ["polars/parquet"] | ||
| parquet = ["polars/parquet"] | ||
| postgres = ["dep:postgres"] | ||
| sqlite = ["dep:rusqlite"] | ||
| vegalite = [] | ||
| ggplot2 = [] | ||
| builtin-data = [] |
There was a problem hiding this comment.
This tweak to features is just prep for Wasm, not related to sqlite.
|
Some more context: I almost added a new dialect-specific implementation here. A question that arises is whether there are performance costs to reaching for the lowest-common-denominator implementations. I had Claude whip up a benchmark, and these are the results:

Details

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use duckdb::Connection;
use ggsql::utils::{sql_generate_series, sql_percentile};
/// Open the in-memory DuckDB database shared by all benchmarks below.
fn setup_connection() -> Connection {
    let conn = Connection::open_in_memory();
    conn.expect("Failed to open DuckDB in-memory connection")
}
/// Create (or replace) table `name` holding `n_rows` rows of a single
/// `val` column filled with random values in [0, 1000).
///
/// # Panics
/// Panics if `n_rows` is zero or if the SQL batch fails to execute.
fn create_data_table(conn: &Connection, name: &str, n_rows: usize) {
    // GENERATE_SERIES(0, n_rows - 1) yields exactly n_rows rows, so n_rows
    // must be >= 1. checked_sub avoids the usize underflow the bare
    // `n_rows - 1` would hit for n_rows == 0 (debug panic / release wrap
    // to u64::MAX, which would try to materialise an enormous series).
    let upper = n_rows
        .checked_sub(1)
        .expect("n_rows must be at least 1");
    conn.execute_batch(&format!(
        "CREATE OR REPLACE TABLE {name} AS \
        SELECT random() * 1000.0 AS val \
        FROM GENERATE_SERIES(0, {upper}) AS seq(n)"
    ))
    .expect("Failed to create data table");
}
/// Compare DuckDB's native GENERATE_SERIES table function against the
/// portable RECURSIVE-CTE series built by `sql_generate_series`, across
/// several series lengths.
fn bench_generate_series(c: &mut Criterion) {
    let conn = setup_connection();
    let mut group = c.benchmark_group("generate_series");
    for n in [64, 512, 1000, 4096] {
        // Native: DuckDB's built-in table function.
        group.bench_with_input(BenchmarkId::new("native", n), &n, |b, &n| {
            let query = format!("SELECT n FROM GENERATE_SERIES(0, {}) AS seq(n)", n - 1);
            b.iter(|| {
                let mut stmt = conn.prepare(&query).unwrap();
                stmt.query_map([], |row| row.get::<_, f64>(0))
                    .unwrap()
                    .for_each(|value| {
                        std::hint::black_box(value.unwrap());
                    });
            });
        });
        // Portable: recursive CTE that also works on SQLite.
        group.bench_with_input(BenchmarkId::new("portable", n), &n, |b, &n| {
            let cte = sql_generate_series(n);
            let query = format!("WITH RECURSIVE {cte} SELECT n FROM __ggsql_seq__");
            b.iter(|| {
                let mut stmt = conn.prepare(&query).unwrap();
                stmt.query_map([], |row| row.get::<_, f64>(0))
                    .unwrap()
                    .for_each(|value| {
                        std::hint::black_box(value.unwrap());
                    });
            });
        });
    }
    group.finish();
}
/// Compare DuckDB's native QUANTILE_CONT aggregate against the portable
/// percentile expression built by `sql_percentile`, at several table sizes.
fn bench_percentile(c: &mut Criterion) {
    let conn = setup_connection();
    let mut group = c.benchmark_group("percentile");
    for n_rows in [100, 1000, 10000] {
        let table = format!("data_{n_rows}");
        create_data_table(&conn, &table, n_rows);
        // Native: DuckDB's continuous-quantile aggregate.
        group.bench_with_input(BenchmarkId::new("native", n_rows), &n_rows, |b, _| {
            let query = format!(
                "SELECT QUANTILE_CONT(val, 0.25) AS q1, QUANTILE_CONT(val, 0.75) AS q3 \
                FROM {table}"
            );
            b.iter(|| {
                let quartiles = conn
                    .prepare(&query)
                    .unwrap()
                    .query_row([], |row| Ok((row.get::<_, f64>(0)?, row.get::<_, f64>(1)?)))
                    .unwrap();
                std::hint::black_box(quartiles);
            });
        });
        // Portable: CASE/NTILE-based expression usable on SQLite too.
        group.bench_with_input(BenchmarkId::new("portable", n_rows), &n_rows, |b, _| {
            let source = format!("SELECT * FROM {table}");
            let q1 = sql_percentile("val", 0.25, &source, &[]);
            let q3 = sql_percentile("val", 0.75, &source, &[]);
            let query = format!("SELECT {q1} AS q1, {q3} AS q3");
            b.iter(|| {
                let quartiles = conn
                    .prepare(&query)
                    .unwrap()
                    .query_row([], |row| Ok((row.get::<_, f64>(0)?, row.get::<_, f64>(1)?)))
                    .unwrap();
                std::hint::black_box(quartiles);
            });
        });
    }
    group.finish();
}
criterion_group!(benches, bench_generate_series, bench_percentile);
criterion_main!(benches); |
|
Just a quick comment to note that b3d6b49 reintroduced the change discussed above.
thomasp85
left a comment
There was a problem hiding this comment.
To the extent of my abilities this looks good. There are a few comments, but if they have good answers they shouldn't stop you from merging.
src/plot/layer/geom/histogram.rs
Outdated
| // where MAX would be interpreted as the aggregate function | ||
| format!( | ||
| "(GREATEST(CEIL(({x} - {min} + {w} * 0.5) / {w}) - 1, 0)) * {w} + {min} - {w} * 0.5", | ||
| "(CASE WHEN CEIL(({x} - {min} + {w} * 0.5) / {w}) - 1 > 0 THEN CEIL(({x} - {min} + {w} * 0.5) / {w}) - 1 ELSE 0 END) * {w} + {min} - {w} * 0.5", |
There was a problem hiding this comment.
Any reason we are not using the dialect here?
There was a problem hiding this comment.
Good spot! I just missed it when adding back SqlDialect.
|
All in all I really like the Dialect approach — I'm sure it will come in handy as we expand our reader support.
I went back and forth many times for this PR on whether to introduce SQL engine specific syntax for percentiles and other incompatibilities. In the end, I decided not to, instead opting to try and use as SQL-agnostic code as I could throughout.
Various things don't work in sqlite, here is the situation as it currently stands as far as I recall:
- No `EXCLUDE`. Instead we keep duplicated columns during the affected queries and drop them from the result.
- `LIMIT 0` does not return the correct column type information — switch to `LIMIT 1`.
- No `GREATEST` or `LEAST` — switch to a utility function that builds the equivalent with `CASE WHEN`.
- No `QUANTILE_CONT` or `PERCENTILE_CONT` — return to using an `NTILE`-based method for boxplot and density.
- No `ANY_VALUE` — we can use `MIN`, which gives us an "any" value.
- No `GENERATE_SERIES` — use a `RECURSIVE` CTE to generate the series values.

Closes #134